From https://www.linkedin.com/learning/data-science-foundations-data-mining/anomaly-detection-in-r (Linkedin Premium)
TODO: Add notes from the IADB machine learning course.
# # I am executing these chunks because this file is for the blog post
# knitr::opts_chunk$set(eval = TRUE,
# echo = TRUE,
# tidy = FALSE,
# results='hide',
# message = FALSE,
# warning = FALSE , fig.show='asis', fig.align='center',
# fig.width=6, fig.height=6)
if (!require("PerformanceAnalytics")) install.packages("PerformanceAnalytics")
if (!require("ggcorrplot")) install.packages("ggcorrplot")
if (!require("GGally")) install.packages("GGally") # Ext to ggplot2
if (!require("ggpubr")) install.packages("ggpubr")
if (!require("kableExtra")) install.packages("kableExtra")
if (!require("pander")) install.packages("pander")
DEFINITION: Dimensionality reduction — summarizing many correlated observed variables with a smaller number of derived components or factors.
EXAMPLES:
ALGORITHMS: + Principal Component Analysis (PCA) + Linear Discriminant Analysis (LDA) + Generalized Discriminant Analysis (GDA)
From “psych” package documentation (p. 213) “The primary empirical difference between a components versus a factor model is the treatment of the variances for each item. Philosophically, components are weighted composites of observed variables while in the factor model, variables are weighted composites of the factors.”
# Conducting a principal components / factor analysis
# Load the built-in mtcars data set and peek at the first five rows
data(mtcars)
head(mtcars, 5)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Select variables by name: drop drat and vs, keep the other nine
mtcars1 <- mtcars[, c("mpg", "cyl", "disp", "hp", "wt",
                      "qsec", "am", "gear", "carb")]
mtcars1[1:5, ]
## mpg cyl disp hp wt qsec am gear carb
## Mazda RX4 21.0 6 160 110 2.620 16.46 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 2.875 17.02 1 4 4
## Datsun 710 22.8 4 108 93 2.320 18.61 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.215 19.44 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.440 17.02 0 3 2
# ========= Principal components model (default prcomp method) =========
# Fit PCA on the whole data frame. Centering shifts each variable's mean
# to 0 (optional); scaling gives each variable unit variance, which is
# helpful when variables are measured on different scales, as here.
pc <- prcomp(mtcars1, center = TRUE, scale = TRUE)
# Or specify variables:
# pc <- prcomp(~ mpg + cyl + disp + hp + wt + qsec + am +
# gear + carb, data = mtcars, scale = TRUE)
# ?prcomp # Generally preferred
# ?princomp # Very slightly different method, similar to S
# Get summary stats: per-component standard deviation, proportion of
# variance explained, and the cumulative proportion (PC1 + PC2 already
# cover ~87% of the variance in this data)
summary(pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.3391 1.5299 0.71836 0.46491 0.38903 0.35099
## Proportion of Variance 0.6079 0.2601 0.05734 0.02402 0.01682 0.01369
## Cumulative Proportion 0.6079 0.8680 0.92537 0.94939 0.96620 0.97989
## PC7 PC8 PC9
## Standard deviation 0.31714 0.24070 0.1499
## Proportion of Variance 0.01118 0.00644 0.0025
## Cumulative Proportion 0.99107 0.99750 1.0000
# Screeplot: bar chart of the variances of the components
plot(pc)
# Print the fitted object: standard deviations of each component and the
# rotation (loading) matrix showing how each variable loads on each PC
pc
## Standard deviations (1, .., p=9):
## [1] 2.3391410 1.5299383 0.7183646 0.4649052 0.3890348 0.3509911 0.3171373
## [8] 0.2406989 0.1498962
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5
## mpg -0.4023287 0.02205294 -0.17272803 -0.1366169 0.31654561
## cyl 0.4068870 0.03589482 -0.27747610 0.1410976 0.02066646
## disp 0.4046964 -0.06479590 -0.17669890 -0.5089434 0.21525777
## hp 0.3699702 0.26518848 -0.01046827 -0.1273173 0.42166543
## wt 0.3850686 -0.15955242 0.33740464 -0.4469327 -0.21141143
## qsec -0.2168575 -0.48343885 0.54815205 -0.2545226 0.05466817
## am -0.2594512 0.46039449 -0.19492256 -0.5354196 -0.55331460
## gear -0.2195660 0.50608232 0.34579810 -0.1799814 0.50533262
## carb 0.2471604 0.44322600 0.53847588 0.3203064 -0.25696817
## PC6 PC7 PC8 PC9
## mpg -0.718609897 0.3633216 -0.1487806 0.13567069
## cyl -0.214224005 0.2099893 0.7951724 0.11635839
## disp 0.010052074 0.2007152 -0.1346748 -0.66099594
## hp -0.254229405 -0.6741641 -0.1210386 0.25474680
## wt 0.002897706 0.3392809 -0.1598333 0.57211273
## qsec -0.226660704 -0.2986852 0.4144075 -0.19671599
## am -0.087616182 -0.2135605 0.1897463 -0.02465169
## gear 0.393990378 0.2484622 0.2614819 0.05482771
## carb -0.398353829 0.1321064 -0.1054553 -0.31083546
# Case scores: how each case (car) loads on the principal components
predict(pc)
## PC1 PC2 PC3 PC4
## Mazda RX4 -0.81883768 1.45577333 -0.21204263 0.315888300
## Mazda RX4 Wag -0.78644303 1.26268953 0.04767210 0.119647855
## Datsun 710 -2.49423117 0.02762658 -0.32023017 -0.401948370
## Hornet 4 Drive -0.29454234 -1.92903945 -0.32211475 -0.069818183
## Hornet Sportabout 1.56041411 -0.80821419 -1.04219408 0.050065675
## Valiant -0.20722532 -2.19417266 0.14402455 -0.073226863
## Duster 360 2.73226603 0.29328994 -0.57716172 0.525124977
## Merc 240D -1.79527743 -1.27281225 1.03388048 0.136366170
## Merc 230 -1.89734058 -1.92598643 1.95890184 -0.259206293
## Merc 280 0.01565012 -0.05866208 1.06454809 0.737712361
## Merc 280C 0.03629307 -0.22610850 1.28872352 0.683986341
## Merc 450SE 1.82083345 -0.68439747 -0.18980574 0.295092091
## Merc 450SL 1.60267678 -0.67977004 -0.27149159 0.401507010
## Merc 450SLC 1.71399687 -0.80382315 -0.07136381 0.369296647
## Cadillac Fleetwood 3.54393557 -0.78715158 0.61681226 -0.844299902
## Lincoln Continental 3.64660694 -0.72728678 0.64331413 -0.870281313
## Chrysler Imperial 3.39264826 -0.52198151 0.39635946 -0.820419326
## Fiat 128 -3.52803830 -0.23945546 -0.32703554 -0.516783758
## Honda Civic -3.44178368 0.32746057 -0.42306580 0.167700576
## Toyota Corolla -3.85421097 -0.29067456 -0.35299640 -0.412244409
## Toyota Corona -1.64164478 -1.97896631 0.10056967 0.621710410
## Dodge Challenger 1.55167305 -0.86712498 -0.90521454 0.326318496
## AMC Javelin 1.44035057 -0.96337487 -0.77406360 0.368187375
## Camaro Z28 2.92480902 0.36716333 -0.57304474 0.526775004
## Pontiac Firebird 1.81339410 -0.90145453 -0.96469148 -0.314790674
## Fiat X1-9 -3.22172493 -0.06085364 -0.44753150 -0.200178011
## Porsche 914-2 -2.66209565 1.53159161 -0.27507492 -0.212645194
## Lotus Europa -3.19041442 1.69409211 -0.52346685 0.008155493
## Ford Pantera L 1.59533098 3.09923346 -0.61246644 -0.694517979
## Ferrari Dino -0.24630742 3.18027405 0.72936287 0.507145572
## Maserati Bora 2.62596044 4.40241877 0.97303537 -0.006628448
## Volvo 142E -1.93672169 0.27969720 0.18785195 -0.463691632
## PC5 PC6 PC7 PC8
## Mazda RX4 -0.84958691 -0.01150126 0.2522589117 0.070182163
## Mazda RX4 Wag -0.88755160 -0.08177799 0.2470770970 0.158396118
## Datsun 710 -0.36518038 0.53888511 -0.5002113448 -0.034749407
## Hornet 4 Drive 0.20547103 -0.04600804 -0.0108490874 0.008913676
## Hornet Sportabout 0.38197028 -0.13573066 0.1519111982 0.077208430
## Valiant -0.08498911 0.26511187 -0.2594831624 0.275929962
## Duster 360 0.19900274 -0.21386156 -0.3957363363 -0.363215758
## Merc 240D 0.39973745 0.22142233 0.5428418769 -0.326885871
## Merc 230 0.60577005 -0.07860918 -0.3862488169 0.339834557
## Merc 280 0.13700873 0.10015509 0.4329982539 -0.004088557
## Merc 280C 0.08183421 0.19097540 0.2483130414 0.169616855
## Merc 450SE -0.13790858 -0.17982680 0.0644632164 0.136576953
## Merc 450SL -0.01105796 -0.31351178 -0.0326072369 0.216281162
## Merc 450SLC -0.11991960 -0.11371190 -0.2087231635 0.352717370
## Cadillac Fleetwood -0.35483328 0.14208110 0.1686960199 -0.096175655
## Lincoln Continental -0.35666482 0.12483822 0.1380129217 -0.166318506
## Chrysler Imperial -0.06847485 -0.39460143 0.2568141030 -0.357074479
## Fiat 128 -0.02567396 -0.61745094 0.1111807743 0.026812599
## Honda Civic -0.28378711 -0.45517710 0.1611472382 -0.085881908
## Toyota Corolla 0.12577796 -0.84883188 0.0006918617 0.159151765
## Toyota Corona 0.04761048 0.14446951 -0.6908183162 -0.456547134
## Dodge Challenger -0.03467077 0.35437059 0.1896203162 0.198121231
## AMC Javelin -0.04322194 0.33421087 0.0475150953 0.334345388
## Camaro Z28 0.05762007 -0.04009785 -0.3067172410 -0.471489439
## Pontiac Firebird 0.39111452 -0.19470872 0.3822511126 -0.037799957
## Fiat X1-9 -0.25319420 0.06217622 -0.1923901293 0.063485192
## Porsche 914-2 0.31823141 0.69486607 0.4076663225 -0.248004690
## Lotus Europa 0.78245261 0.05939704 0.1649363550 -0.219274124
## Ford Pantera L 0.68539841 0.59731381 -0.1758978468 0.419647806
## Ferrari Dino -0.23921602 0.06422736 0.2232814502 -0.019483666
## Maserati Bora 0.27345257 -0.57263382 -0.5540820690 0.065079511
## Volvo 142E -0.57652141 0.40354034 -0.4779124155 -0.185311588
## PC9
## Mazda RX4 -0.181855680
## Mazda RX4 Wag -0.094402631
## Datsun 710 0.107758386
## Hornet 4 Drive -0.123239037
## Hornet Sportabout -0.150672423
## Valiant 0.017282559
## Duster 360 -0.168608127
## Merc 240D 0.034834992
## Merc 230 -0.189739133
## Merc 280 0.111701775
## Merc 280C 0.014135709
## Merc 450SE 0.399280466
## Merc 450SL 0.198722029
## Merc 450SLC 0.136650969
## Cadillac Fleetwood -0.255615988
## Lincoln Continental -0.035108813
## Chrysler Imperial 0.221926989
## Fiat 128 0.214967700
## Honda Civic -0.295988897
## Toyota Corolla 0.024795528
## Toyota Corona -0.065421142
## Dodge Challenger -0.028308674
## AMC Javelin -0.057433273
## Camaro Z28 0.067421709
## Pontiac Firebird -0.119242525
## Fiat X1-9 0.006363979
## Porsche 914-2 0.093645749
## Lotus Europa 0.020202599
## Ford Pantera L -0.003396395
## Ferrari Dino -0.006799694
## Maserati Bora -0.037841219
## Volvo 142E 0.143982515
# Biplot: case scores and variable loadings on PC1/PC2 in a single plot
biplot(pc)
# =========== Factor Analysis ===========
# factanal() applies a varimax rotation by default and reports:
#  - a chi-square test of whether this number of factors is sufficient
#    to reproduce the data (we want p > .05),
#  - uniqueness values for the variables,
#  - variable loadings on the factors, and variance statistics.
# One-factor solution:
factanal(x = mtcars1, factors = 1)
##
## Call:
## factanal(x = mtcars1, factors = 1)
##
## Uniquenesses:
## mpg cyl disp hp wt qsec am gear carb
## 0.162 0.121 0.086 0.308 0.198 0.775 0.642 0.723 0.737
##
## Loadings:
## Factor1
## mpg -0.915
## cyl 0.937
## disp 0.956
## hp 0.832
## wt 0.896
## qsec -0.475
## am -0.599
## gear -0.527
## carb 0.513
##
## Factor1
## SS loadings 5.249
## Proportion Var 0.583
##
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 146.3 on 27 degrees of freedom.
## The p-value is 2.41e-18
# Two-factor solution (p = 1.32e-05, still insufficient)
factanal(x = mtcars1, factors = 2)
##
## Call:
## factanal(x = mtcars1, factors = 2)
##
## Uniquenesses:
## mpg cyl disp hp wt qsec am gear carb
## 0.156 0.106 0.090 0.083 0.158 0.261 0.202 0.195 0.302
##
## Loadings:
## Factor1 Factor2
## mpg -0.700 -0.595
## cyl 0.658 0.679
## disp 0.759 0.577
## hp 0.355 0.889
## wt 0.815 0.422
## qsec 0.105 -0.853
## am -0.888
## gear -0.875 0.198
## carb 0.835
##
## Factor1 Factor2
## SS loadings 3.857 3.590
## Proportion Var 0.429 0.399
## Cumulative Var 0.429 0.827
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 56.6 on 19 degrees of freedom.
## The p-value is 1.32e-05
# Three-factor solution (p = .017, still below .05)
factanal(x = mtcars1, factors = 3)
##
## Call:
## factanal(x = mtcars1, factors = 3)
##
## Uniquenesses:
## mpg cyl disp hp wt qsec am gear carb
## 0.152 0.049 0.080 0.118 0.005 0.125 0.244 0.099 0.196
##
## Loadings:
## Factor1 Factor2 Factor3
## mpg 0.610 -0.538 -0.431
## cyl -0.614 0.733 0.192
## disp -0.703 0.558 0.339
## hp -0.263 0.820 0.375
## wt -0.738 0.292 0.604
## qsec -0.149 -0.923
## am 0.857 -0.132
## gear 0.936 0.139
## carb 0.153 0.678 0.567
##
## Factor1 Factor2 Factor3
## SS loadings 3.513 3.218 1.202
## Proportion Var 0.390 0.358 0.134
## Cumulative Var 0.390 0.748 0.881
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 24.56 on 12 degrees of freedom.
## The p-value is 0.017
# Four-factor solution — the first with p > .05 (p = .416), i.e. four
# factors are sufficient to reproduce the observed correlations
factanal(x = mtcars1, factors = 4)
##
## Call:
## factanal(x = mtcars1, factors = 4)
##
## Uniquenesses:
## mpg cyl disp hp wt qsec am gear carb
## 0.137 0.045 0.005 0.108 0.038 0.101 0.189 0.126 0.031
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## mpg 0.636 -0.445 -0.453 -0.234
## cyl -0.601 0.701 0.277 0.163
## disp -0.637 0.555 0.176 0.500
## hp -0.249 0.721 0.472 0.296
## wt -0.730 0.219 0.417 0.456
## qsec -0.182 -0.897 -0.246
## am 0.891 -0.100
## gear 0.907 0.226
## carb 0.478 0.851
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 3.424 2.603 1.549 0.644
## Proportion Var 0.380 0.289 0.172 0.072
## Cumulative Var 0.380 0.670 0.842 0.913
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 6.06 on 6 degrees of freedom.
## The p-value is 0.416
Somewhat related: the anomaly-detection example below.
# DM_05_03.R — anomaly (outlier) detection example
# INSTALL AND LOAD PACKAGES ################################
# pacman::p_load() installs each package if needed and then attaches it
pacman::p_load(ggplot2, grid, gridExtra, robustbase)
# DATA #####################################################
# Import the data; here::here() builds a path relative to the project root
# NOTE(review): the name `data` masks utils::data() for the rest of the
# session — consider renaming (e.g. `states`), but all downstream code
# currently uses `data`.
data <- read.csv(here::here("AnomalyData.csv"))
# Structure of the data: 48 state-level cases x 30 variables (see below)
str(data)
## 'data.frame': 48 obs. of 30 variables:
## $ State : Factor w/ 48 levels "Alabama","Arizona",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ state_code : Factor w/ 48 levels "AL","AR","AZ",..: 1 3 2 4 5 6 7 8 9 11 ...
## $ data.science : num -1 -0.42 -0.66 1.95 0.34 0.69 0.45 -0.73 -0.27 -0.67 ...
## $ cluster.analysis : num -0.13 -0.73 -0.39 -0.62 0 1.28 2.91 -1.38 -0.57 -1.12 ...
## $ college : num 1.1 -0.1 -0.64 -0.26 -0.61 1.17 -0.46 -0.3 0.25 -0.87 ...
## $ startup : num -0.68 0.11 -0.08 2.02 1.49 0.41 0.14 -0.75 -1.13 1.3 ...
## $ entrepreneur : num 0.15 0.57 0.01 0.46 0.05 0.09 2.74 0.78 1.97 0.21 ...
## $ ceo : num -0.73 0.25 -0.66 1.27 0.33 1.52 0.91 0.36 0.42 -0.59 ...
## $ mortgage : num 1.53 0.95 -0.5 -0.97 1.38 0.51 1.71 0.03 0.25 0.62 ...
## $ nba : num -0.74 0.38 -0.71 1.46 -0.8 0.03 0.37 1.43 0.89 -1.47 ...
## $ nfl : num -1.83 0.68 -1.59 -0.91 1.17 -0.64 1.63 -0.47 -0.53 -0.3 ...
## $ mlb : num -1.3 0.14 -1.24 0.39 -0.51 1.25 0.86 0.06 -0.67 -1.35 ...
## $ fifa : num -0.8 0.1 -0.88 1.94 -0.33 1.58 0.83 1.07 0.09 -0.9 ...
## $ modern.dance : num -1.22 0.3 -1.36 0.33 -0.34 0.59 -0.16 -0.51 -0.81 0.21 ...
## $ prius : num -0.74 1.19 -0.43 3.97 0.24 0.11 0.12 -0.02 -0.49 -0.03 ...
## $ escalade : num 0.1 0.78 0.93 -0.28 0.01 -1.14 -0.61 0.82 1.14 -0.6 ...
## $ subaru : num -1.19 -0.7 -0.79 -0.52 1.37 1.17 -0.1 -1.01 -1.04 1.07 ...
## $ jello : num -0.75 -0.58 -0.19 -1.21 -0.55 -0.78 -0.56 -1.07 -1.03 1.23 ...
## $ bbq : num 1.52 0.36 1.01 1.37 0.63 -1.05 -1.05 -0.18 1.22 0.19 ...
## $ royal.family : num 0.26 -1.06 -0.09 -1.04 -0.8 1.11 0.94 -0.84 -1.26 -0.83 ...
## $ obfuscation : num -0.32 0.38 -0.45 0.7 1.32 0.47 0.11 -0.88 -0.71 1.05 ...
## $ unicorn : num -1.03 0.1 -0.32 -0.38 0.2 0.09 -0.49 -1.1 -1.01 1.8 ...
## $ Extraversion : num 55.5 50.6 49.9 51.4 45.3 57.6 47 60.9 63.2 40.7 ...
## $ Agreeableness : num 52.7 46.6 52.7 49 47.5 38.6 38.8 50.7 60 52.9 ...
## $ Conscientiousness: num 55.5 58.4 41 43.2 58.8 34.2 36.5 62.7 68.8 44.5 ...
## $ Neuroticism : num 48.7 38.1 56.2 39.1 34.3 53.4 62.4 40.8 38 44.2 ...
## $ Openness : num 42.7 54.7 40.3 65 57.9 53.9 42.7 61 56.9 44.7 ...
## $ PsychRegions : int 1 2 1 2 1 3 3 1 1 2 ...
## $ region : int 3 4 3 4 4 1 3 3 3 4 ...
## $ division : int 6 8 7 9 8 1 5 5 5 8 ...
# Transform the three categorical indicator columns (read in as integers)
# to factors so plots and models treat them as groups, not numbers
data$PsychRegions <- as.factor(data$PsychRegions)
data$region <- as.factor(data$region)
data$division <- as.factor(data$division)
# UNIVARIATE OUTLIERS ######################################
# One boxplot per variable. Cases flagged as outliers by boxplot.stats()
# (i.e. beyond 1.5 * IQR from the box) are labeled with their state code.
# The original code repeated the same plotting recipe ten times; it is
# factored into a single helper here (also replacing the deprecated
# qplot() with an explicit ggplot() call of identical appearance).

# Helper: boxplot of column `var` of `df`, labeling univariate outliers.
# `var` is a column name as a string; `.data[[var]]` resolves it in aes().
box_outliers <- function(df, var) {
  outs <- boxplot.stats(df[[var]])$out  # values outside the whiskers
  ggplot(df, aes(x = 1, y = .data[[var]])) +
    geom_boxplot(outlier.colour = "#E38942") +
    xlim(0, 2) +
    labs(x = NULL, y = NULL, title = var) +
    geom_text(aes(label = ifelse(.data[[var]] %in% outs,
                                 as.character(state_code), "")),
              hjust = 1.5)
}

u01 <- box_outliers(data, "data.science")
u01
u02 <- box_outliers(data, "cluster.analysis")
u02
u03 <- box_outliers(data, "college")
u03
u04 <- box_outliers(data, "startup")
u04
u05 <- box_outliers(data, "entrepreneur")
u05
u06 <- box_outliers(data, "ceo")
u06
u07 <- box_outliers(data, "mortgage")
u07
u08 <- box_outliers(data, "nba")
u08
u09 <- box_outliers(data, "royal.family")
u09
u10 <- box_outliers(data, "Neuroticism")
u10

# Plot all 10 together
grid.arrange(u01, u02, u03, u04, u05,
             u06, u07, u08, u09, u10,
             nrow = 2,
             top = "Boxplots: Univariate outliers")
# BIVARIATE OUTLIERS #######################################
# Scatterplots of variable pairs with a 99% confidence ellipse; cases
# beyond an ad hoc cutoff are labeled with the state code. The original
# code repeated the same recipe eight times; it is factored into one
# helper (replacing the deprecated qplot() with an equivalent ggplot()).

# Helper: scatter of columns `xvar` vs `yvar` with a .99 ellipse.
# `flag` is an optional function(df) -> logical marking cases to label;
# when NULL (default) no labels are drawn (as in b7/b8 originally).
scatter_ellipse <- function(df, xvar, yvar, flag = NULL) {
  p <- ggplot(df, aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_point() +
    labs(x = xvar, y = yvar, title = paste(xvar, "vs", yvar)) +
    stat_ellipse(level = .99, color = "#E38942")
  if (!is.null(flag)) {
    p <- p + geom_text(aes(label = ifelse(flag(df),
                                          as.character(state_code), "")),
                       hjust = 1.5)
  }
  p
}

b1 <- scatter_ellipse(data, "data.science", "cluster.analysis",
                      function(d) d$data.science > 1.8 | d$cluster.analysis > 1.6)
b1
b2 <- scatter_ellipse(data, "mortgage", "ceo",
                      function(d) d$ceo > 2)
b2
b3 <- scatter_ellipse(data, "modern.dance", "Openness",
                      function(d) d$modern.dance > 2 | d$Openness < 30)
b3
b4 <- scatter_ellipse(data, "fifa", "nba",
                      function(d) d$fifa > 2)
b4
b5 <- scatter_ellipse(data, "subaru", "escalade",
                      function(d) d$subaru > 2.5)
b5
b6 <- scatter_ellipse(data, "unicorn", "obfuscation",
                      function(d) d$unicorn > 2 | d$obfuscation > 2)
b6
b7 <- scatter_ellipse(data, "Conscientiousness", "Extraversion")
b7
b8 <- scatter_ellipse(data, "college", "royal.family")
b8

# Plot all 8 together
grid.arrange(b1, b2, b3, b4, b5, b6, b7, b8,
             nrow = 2, top = "Bivariate outliers")
# MULTIVARIATE OUTLIERS ####################################
# Measure the overall distance of each case from the others using both
# classical Mahalanobis distance and a robust (MCD) distance estimate.
# Drop the two ID columns (1: State, 2: state_code) and the three factor
# columns (28: PsychRegions, 29: region, 30: division) so only the
# quantitative variables enter the covariance estimate.
mcd <- covMcd(data[-c(1, 2, 28, 29, 30)])
## covMcd() warns here: "n < 2 * p, i.e., possibly too small sample size"
## (48 cases vs 25 variables)
# Show the two diagnostic plots side by side
par(mfrow = c(1, 2))
# Mahalanobis vs. robust distance
plot(mcd,
     which = "dd",
     labels.id = as.character(data$state_code))
# QQ plot for robust distance
plot(mcd,
     which = "qqchi2",
     labels.id = as.character(data$state_code))
# DM_06_03.R — association rule mining (market basket analysis)
# INSTALL AND LOAD PACKAGES ################################
pacman::p_load(arules, arulesViz)
# DATA #####################################################
## Read transactional data from arules package
data("Groceries") # Load data
#?Groceries # Help on data
str(Groceries) # Structure of data: sparse transactions matrix (see below)
## Formal class 'transactions' [package "arules"] with 3 slots
## ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. ..@ i : int [1:43367] 13 60 69 78 14 29 98 24 15 29 ...
## .. .. ..@ p : int [1:9836] 0 4 7 8 12 16 21 22 27 28 ...
## .. .. ..@ Dim : int [1:2] 169 9835
## .. .. ..@ Dimnames:List of 2
## .. .. .. ..$ : NULL
## .. .. .. ..$ : NULL
## .. .. ..@ factors : list()
## ..@ itemInfo :'data.frame': 169 obs. of 3 variables:
## .. ..$ labels: chr [1:169] "frankfurter" "sausage" "liver loaf" "ham" ...
## .. ..$ level2: Factor w/ 55 levels "baby food","bags",..: 44 44 44 44 44 44 44 42 42 41 ...
## .. ..$ level1: Factor w/ 10 levels "canned food",..: 6 6 6 6 6 6 6 6 6 6 ...
## ..@ itemsetInfo:'data.frame': 0 obs. of 0 variables
# Summary: item frequencies, transaction-length distribution, and
# extended item information (9835 transactions over 169 items)
summary(Groceries) # Includes 5 most frequent items
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels level2 level1
## 1 frankfurter sausage meat and sausage
## 2 sausage sausage meat and sausage
## 3 liver loaf sausage meat and sausage
# RULES ####################################################
# Mine association rules with the Apriori algorithm.
# Minimum support = .001 (an absolute count of about 10 of the 9835
# transactions); minimum confidence = .75.
# Full parameter names are used instead of the abbreviations
# `supp`/`conf` to avoid relying on partial matching.
rules <- apriori(Groceries,
                 parameter = list(support = 0.001,
                                  confidence = 0.75))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.75 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [777 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Limit printed numbers to 2 significant digits
# NOTE(review): options() changes global state for the rest of the session
options(digits=2)
# Inspect the first 10 rules: lhs => rhs with support/confidence/lift/count
inspect(rules[1:10])
## lhs rhs support confidence lift count
## [1] {liquor,
## red/blush wine} => {bottled beer} 0.0019 0.90 11.2 19
## [2] {curd,
## cereals} => {whole milk} 0.0010 0.91 3.6 10
## [3] {root vegetables,
## cereals} => {whole milk} 0.0010 0.77 3.0 10
## [4] {yogurt,
## cereals} => {whole milk} 0.0017 0.81 3.2 17
## [5] {butter,
## jam} => {whole milk} 0.0010 0.83 3.3 10
## [6] {whipped/sour cream,
## instant coffee} => {other vegetables} 0.0010 0.77 4.0 10
## [7] {soups,
## bottled beer} => {whole milk} 0.0011 0.92 3.6 11
## [8] {yogurt,
## Instant food products} => {whole milk} 0.0011 0.79 3.1 11
## [9] {napkins,
## house keeping products} => {whole milk} 0.0013 0.81 3.2 13
## [10] {whipped/sour cream,
## house keeping products} => {whole milk} 0.0012 0.92 3.6 12
# PLOTS ####################################################
# Scatterplot of support x confidence (colored by lift)
plot(rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Graph of top 20 rules. The old `control = list(type = "items")`
# argument is not recognized by current arulesViz (it triggered the
# "Unknown control parameters: type" warning below), so it is dropped.
plot(rules[1:20],
     method = "graph")
## Warning: Unknown control parameters: type
## Available control parameters (with default values):
## main = Graph for 20 rules
## nodeColors = c("#66CC6680", "#9999CC80")
## nodeCol = c("#EE0000FF", "#EE0303FF", "#EE0606FF", "#EE0909FF", "#EE0C0CFF", "#EE0F0FFF", "#EE1212FF", "#EE1515FF", "#EE1818FF", "#EE1B1BFF", "#EE1E1EFF", "#EE2222FF", "#EE2525FF", "#EE2828FF", "#EE2B2BFF", "#EE2E2EFF", "#EE3131FF", "#EE3434FF", "#EE3737FF", "#EE3A3AFF", "#EE3D3DFF", "#EE4040FF", "#EE4444FF", "#EE4747FF", "#EE4A4AFF", "#EE4D4DFF", "#EE5050FF", "#EE5353FF", "#EE5656FF", "#EE5959FF", "#EE5C5CFF", "#EE5F5FFF", "#EE6262FF", "#EE6666FF", "#EE6969FF", "#EE6C6CFF", "#EE6F6FFF", "#EE7272FF", "#EE7575FF", "#EE7878FF", "#EE7B7BFF", "#EE7E7EFF", "#EE8181FF", "#EE8484FF", "#EE8888FF", "#EE8B8BFF", "#EE8E8EFF", "#EE9191FF", "#EE9494FF", "#EE9797FF", "#EE9999FF", "#EE9B9BFF", "#EE9D9DFF", "#EE9F9FFF", "#EEA0A0FF", "#EEA2A2FF", "#EEA4A4FF", "#EEA5A5FF", "#EEA7A7FF", "#EEA9A9FF", "#EEABABFF", "#EEACACFF", "#EEAEAEFF", "#EEB0B0FF", "#EEB1B1FF", "#EEB3B3FF", "#EEB5B5FF", "#EEB7B7FF", "#EEB8B8FF", "#EEBABAFF", "#EEBCBCFF", "#EEBDBDFF", "#EEBFBFFF", "#EEC1C1FF", "#EEC3C3FF", "#EEC4C4FF", "#EEC6C6FF", "#EEC8C8FF", "#EEC9C9FF", "#EECBCBFF", "#EECDCDFF", "#EECFCFFF", "#EED0D0FF", "#EED2D2FF", "#EED4D4FF", "#EED5D5FF", "#EED7D7FF", "#EED9D9FF", "#EEDBDBFF", "#EEDCDCFF", "#EEDEDEFF", "#EEE0E0FF", "#EEE1E1FF", "#EEE3E3FF", "#EEE5E5FF", "#EEE7E7FF", "#EEE8E8FF", "#EEEAEAFF", "#EEECECFF", "#EEEEEEFF")
## edgeCol = c("#474747FF", "#494949FF", "#4B4B4BFF", "#4D4D4DFF", "#4F4F4FFF", "#515151FF", "#535353FF", "#555555FF", "#575757FF", "#595959FF", "#5B5B5BFF", "#5E5E5EFF", "#606060FF", "#626262FF", "#646464FF", "#666666FF", "#686868FF", "#6A6A6AFF", "#6C6C6CFF", "#6E6E6EFF", "#707070FF", "#727272FF", "#747474FF", "#767676FF", "#787878FF", "#7A7A7AFF", "#7C7C7CFF", "#7E7E7EFF", "#808080FF", "#828282FF", "#848484FF", "#868686FF", "#888888FF", "#8A8A8AFF", "#8C8C8CFF", "#8D8D8DFF", "#8F8F8FFF", "#919191FF", "#939393FF", "#959595FF", "#979797FF", "#999999FF", "#9A9A9AFF", "#9C9C9CFF", "#9E9E9EFF", "#A0A0A0FF", "#A2A2A2FF", "#A3A3A3FF", "#A5A5A5FF", "#A7A7A7FF", "#A9A9A9FF", "#AAAAAAFF", "#ACACACFF", "#AEAEAEFF", "#AFAFAFFF", "#B1B1B1FF", "#B3B3B3FF", "#B4B4B4FF", "#B6B6B6FF", "#B7B7B7FF", "#B9B9B9FF", "#BBBBBBFF", "#BCBCBCFF", "#BEBEBEFF", "#BFBFBFFF", "#C1C1C1FF", "#C2C2C2FF", "#C3C3C4FF", "#C5C5C5FF", "#C6C6C6FF", "#C8C8C8FF", "#C9C9C9FF", "#CACACAFF", "#CCCCCCFF", "#CDCDCDFF", "#CECECEFF", "#CFCFCFFF", "#D1D1D1FF", "#D2D2D2FF", "#D3D3D3FF", "#D4D4D4FF", "#D5D5D5FF", "#D6D6D6FF", "#D7D7D7FF", "#D8D8D8FF", "#D9D9D9FF", "#DADADAFF", "#DBDBDBFF", "#DCDCDCFF", "#DDDDDDFF", "#DEDEDEFF", "#DEDEDEFF", "#DFDFDFFF", "#E0E0E0FF", "#E0E0E0FF", "#E1E1E1FF", "#E1E1E1FF", "#E2E2E2FF", "#E2E2E2FF", "#E2E2E2FF")
## alpha = 0.5
## cex = 1
## itemLabels = TRUE
## labelCol = #000000B3
## measureLabels = FALSE
## precision = 3
## layout = NULL
## layoutParams = list()
## arrowSize = 0.5
## engine = igraph
## plot = TRUE
## plot_options = list()
## max = 100
## verbose = FALSE
# Parallel coordinates plot of top 20 rules; reorder = TRUE rearranges
# the items to reduce line crossings
plot(rules[1:20],
method = "paracoord",
control = list(reorder = TRUE))
# Matrix plot of antecedents (rows) and consequents (columns)
plot(rules[1:20],
method = "matrix",
control = list(reorder = "none")# ,
# measure=c("lift", "confidence")
)
## Itemsets in Antecedent (LHS)
## [1] "{liquor,red/blush wine}"
## [2] "{curd,cereals}"
## [3] "{root vegetables,cereals}"
## [4] "{yogurt,cereals}"
## [5] "{butter,jam}"
## [6] "{whipped/sour cream,instant coffee}"
## [7] "{soups,bottled beer}"
## [8] "{yogurt,Instant food products}"
## [9] "{napkins,house keeping products}"
## [10] "{whipped/sour cream,house keeping products}"
## [11] "{root vegetables,house keeping products}"
## [12] "{pastry,sweet spreads}"
## [13] "{turkey,curd}"
## [14] "{turkey,whipped/sour cream}"
## [15] "{turkey,bottled water}"
## [16] "{turkey,root vegetables}"
## [17] "{rice,sugar}"
## [18] "{frozen vegetables,rice}"
## [19] "{butter,rice}"
## [20] "{domestic eggs,rice}"
## Itemsets in Consequent (RHS)
## [1] "{bottled beer}" "{whole milk}" "{other vegetables}"
# Grouped matrix plot: antecedents are clustered into groups; balloon
# size and color encode the rules' interest measures
plot(rules[1:20], method = "grouped")
Sequence mining –> look for chains in the data (if this happens, that is likely to happen next)
EXAMPLES: - genetic sequencing - recommendation engines - marketing offers - behavioral state switching
ALGORITHMS: + Generalized Sequential Patterns (GSP) + Hidden Markov Models + SPADE (Sequential Pattern Discovery using Equivalence classes)